# Load packages.
# FIX(review): read_csv(), %>%, ggplot2, recipes, parsnip, tune, yardstick etc.
# are used throughout but never attached anywhere in the file.
library(tidyverse)
library(tidymodels)

# Load data
load("../data/my_train1.Rdata")
load("../data/my_test1.Rdata")
sampleSub <- read_csv("../data/sample_submission.csv")
# NOTE(review): the rest of the script works with `mi_train`/`mi_test`; confirm
# the .Rdata files actually provide `new_train1` as well, or change this to
# head(mi_train).
head(new_train1)

# Theme and colours shared by all plots below
theme_set(theme_bw())
colores <- c("#5B6DC8", "#D33B44")
# Density of each numeric predictor by Rating (log10 x-axis), faceted 3x3.
# FIX: the original line fused labs(...) with library(corrr) — syntax error.
mi_train %>%
  select(where(is.numeric)) %>%
  select(-new_year) %>%
  mutate(Rating = as.factor(Rating)) %>%
  pivot_longer(cols = !Rating, names_to = "variable", values_to = "valores") %>%
  ggplot(aes(x = valores, fill = Rating, color = Rating)) +
  facet_wrap(~variable, scales = "free", ncol = 3, nrow = 3) +
  geom_density(alpha = 0.4) +
  scale_color_manual(values = colores) +
  scale_fill_manual(values = colores) +
  scale_x_log10() +
  labs(title = "Escala logarítmica")

# corrr is used by the correlation plot that follows
library(corrr)
# Spearman correlations between log-transformed numeric predictors
mi_train %>%
  select(where(is.numeric)) %>%
  select(-new_year, -Rating) %>%
  mutate(across(everything(), log)) %>%
  correlate(method = "spearman") %>%
  rearrange() %>%
  shave() %>%
  rplot(print_cor = TRUE) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

# Downloads vs reviews (log-log), coloured by Rating
mi_train %>%
  mutate(Rating = as.factor(Rating)) %>%
  ggplot(aes(x = Reviews, y = new_installs, color = Rating)) +
  geom_point(size = 1) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(se = FALSE, size = 1) +
  scale_color_manual(values = colores) +
  labs(x = "Reseñas", y = "Descargas",
       title = "Descargas vs Reseñas",
       subtitle = "Escala logarítmica")

# Reviews vs app size (log-log)
mi_train %>%
  mutate(Rating = as.factor(Rating)) %>%
  ggplot(aes(x = size_kb, y = Reviews, color = Rating)) +
  geom_point(size = 1, alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(se = FALSE, size = 1.5) +
  scale_color_manual(values = colores) +
  labs(x = "Tamaño de App", y = "Reseñas",
       title = "Reseñas vs Tamaño de App",
       subtitle = "Escala logarítmica")

# Reviews vs app version (log-log).
# FIX: axis labels were swapped relative to the aesthetics — x is the version,
# y is the review count.
mi_train %>%
  mutate(Rating = as.factor(Rating)) %>%
  ggplot(aes(x = new_version, y = Reviews, color = Rating)) +
  geom_point(size = 1, alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(se = FALSE, size = 1.5) +
  scale_color_manual(values = colores) +
  labs(x = "Versión de App", y = "Reseñas",
       title = "Reseñas vs Versión de App",
       subtitle = "Escala logarítmica")

# Reviews vs app sub-version (log-log).
# FIX: labels were swapped and the title was copy-pasted from the previous plot.
mi_train %>%
  mutate(Rating = as.factor(Rating)) %>%
  ggplot(aes(x = new_subversion, y = Reviews, color = Rating)) +
  geom_point(size = 1, alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(se = FALSE, size = 1.5) +
  scale_color_manual(values = colores) +
  labs(x = "Sub-versión de App", y = "Reseñas",
       title = "Reseñas vs Sub-versión de App",
       subtitle = "Escala logarítmica")
# Recipe for dimensionality reduction (PCA): keep the numeric columns except
# new_year, treat Rating as an id, impute missing values, Box-Cox, normalise,
# then extract principal components.
datos_numericos <- mi_train %>%
  select(where(is.numeric)) %>%
  select(-new_year)

receta_reductDim <- recipe(~ ., data = datos_numericos) %>%
  update_role(Rating, new_role = "id") %>%
  # NOTE(review): step_knnimpute() is deprecated in recent {recipes};
  # step_impute_knn() is the drop-in replacement.
  step_knnimpute(all_predictors()) %>%
  step_BoxCox(all_predictors()) %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- receta_reductDim %>% prep()
# Print the trained recipe.
# FIX: the console output ("Data Recipe" ...) had been pasted onto this line.
pca_prep
# Console output pasted into the script; commented out so the file parses:
# Inputs:
# Training data contained 5788 data points and 2060 incomplete rows.
# Operations:
# K-nearest neighbor imputation for new_day, size_kb, new_installs, new_price, new_month_num, ... [trained]
# Box-Cox transformation on new_day, size_kb, new_month_num, min_android [trained]
# Centering and scaling for Reviews, new_day, size_kb, new_installs, new_price, ... [trained]
# PCA extraction with Reviews, new_day, size_kb, new_installs, new_price, ... [trained]
# Loadings of the first five principal components (step 4 of the recipe).
# FIX: the original fused geom_col(...) with the next pipeline and fused
# scale_color_manual(...) with library(plotly).
tidy_pca <- tidy(pca_prep, 4)

tidy_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  ggplot(aes(x = value, y = terms, fill = value)) +
  facet_wrap(~component, ncol = 5) +
  geom_col(show.legend = FALSE)

# Observations projected on PC1/PC2, coloured by Rating
juice(pca_prep) %>%
  ggplot(aes(x = PC1, y = PC2, color = as.factor(Rating))) +
  geom_point() +
  geom_vline(xintercept = 0, lty = 2, lwd = 0.1) +
  geom_hline(yintercept = 0, lty = 2, lwd = 0.1) +
  scale_color_manual(values = colores)

# plotly is used by the 3-D PCA plot that follows
library(plotly)
# Interactive 3-D scatter of the first three principal components.
# FIX: a pasted RColorBrewer warning had been fused onto the add_markers() line.
plot_ly(x = ~PC1, y = ~PC2, z = ~PC3, data = juice(pca_prep),
        color = ~as.factor(Rating)) %>%
  add_markers()
# minimal value for n is 3, returning requested palette with 3 different levels
# Console warnings pasted into the script; commented out so the file parses:
# minimal value for n is 3, returning requested palette with 3 different levels
# minimal value for n is 3, returning requested palette with 3 different levels
# minimal value for n is 3, returning requested palette with 3 different levels
# embed provides step_umap()
library(embed)

# UMAP recipe: identical preprocessing to the PCA recipe, but the final step
# projects the predictors onto a UMAP embedding instead.
datos_umap <- mi_train %>%
  select(where(is.numeric)) %>%
  select(-new_year)

receta_umap <- recipe(~ ., data = datos_umap) %>%
  update_role(Rating, new_role = "id") %>%
  step_knnimpute(all_predictors()) %>%
  step_BoxCox(all_predictors()) %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

umap_prep <- receta_umap %>% prep()
# Print the trained UMAP recipe.
# FIX: the console output ("Data Recipe" ...) had been pasted onto this line.
umap_prep
# Console output pasted into the script; commented out so the file parses:
# Inputs:
# Training data contained 5788 data points and 2060 incomplete rows.
# Operations:
# K-nearest neighbor imputation for new_day, size_kb, new_installs, ... [trained]
# Box-Cox transformation on new_day, size_kb, new_month_num, min_android [trained]
# Centering and scaling for Reviews, new_day, size_kb, new_installs, ... [trained]
# UMAP embedding for Reviews, new_day, size_kb, new_installs, ... [trained]
# Observations in UMAP space, coloured by Rating.
# FIX: the original fused scale_color_manual(...) with the next pipeline and
# fused theme(...) with library(themis).
juice(umap_prep) %>%
  ggplot(aes(x = umap_1, y = umap_2, color = as.factor(Rating))) +
  geom_point() +
  geom_vline(xintercept = 0, lty = 2, lwd = 0.1) +
  geom_hline(yintercept = 0, lty = 2, lwd = 0.1) +
  scale_color_manual(values = colores)

# Class balance of the outcome variable
mi_train %>%
  mutate(Rating = as.factor(Rating)) %>%
  count(Rating) %>%
  ggplot(aes(x = Rating, y = n, fill = Rating, color = Rating,
             label = n)) +
  geom_col() +
  geom_label(color = "white") +
  scale_color_manual(values = colores) +
  scale_fill_manual(values = colores) +
  theme(legend.position = "none")

# themis provides step_upsample() for class balancing
library(themis)
# Train-test split (80/20)
set.seed(2020)
# NOTE(review): stratifying on Type rather than on the outcome Rating —
# confirm this is intentional (stratifying on the outcome is the usual choice).
data_split <- initial_split(data = mi_train, prop = 0.80, strata = Type)
data_train <- training(data_split) %>% mutate(Rating = as.factor(Rating))
data_test <- testing(data_split) %>% mutate(Rating = as.factor(Rating))
# Preprocessing recipe: kNN-impute -> Box-Cox -> pool rare nominal levels into
# "otra" -> one-hot dummies -> normalise -> upsample the minority Rating class.
# Step order matters: dummies are created before normalisation so the
# indicator columns are scaled as well.
receta1 <- recipe(Rating ~ ., data = data_train) %>%
step_knnimpute(all_predictors(), neighbors = 5) %>%
step_BoxCox(all_numeric(), -all_outcomes()) %>%
step_other(all_nominal(), -all_outcomes(), other = "otra") %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
step_upsample(Rating)
# Train (prep) the preprocessing recipe on the training split.
# FIX: the original fused prep() with the receta_sub definition and fused
# juice(prep_sub) with the lasso1 model spec.
receta1_prep <- receta1 %>%
  prep()

# Recipe for the submission (test) file.
# NOTE(review): prepping a separate recipe on mi_test learns imputation,
# Box-Cox and normalisation statistics from the test set itself; the
# leakage-free alternative is bake(receta1_prep, new_data = mi_test) —
# confirm before changing, as column sets may differ.
receta_sub <- recipe(~ ., data = mi_test) %>%
  step_knnimpute(all_predictors(), neighbors = 5) %>%
  step_BoxCox(all_numeric(), -all_outcomes()) %>%
  step_other(all_nominal(), -all_outcomes(), other = "otra") %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_normalize(all_numeric(), -all_outcomes())
prep_sub <- prep(receta_sub)
data_sub <- juice(prep_sub)

# Lasso logistic regression (mixture = 1) with a fixed penalty of 0.1
lasso1 <- logistic_reg(penalty = 0.1, mixture = 1) %>%
  set_mode("classification") %>%
  set_engine("glmnet")
# Base workflow carrying only the preprocessing recipe; model specs are
# attached with add_model() where each fit happens.
wf1 <- add_recipe(workflow(), receta1)
# Fit the fixed-penalty lasso on the training data.
# FIX: the original fused tidy() with set.seed(2020).
res_lasso1 <- wf1 %>%
  add_model(lasso1) %>%
  fit(data = data_train)

# Coefficients of the fitted model
res_lasso1 %>%
  tidy()

# Seed for the bootstrap resamples created below
set.seed(2020)
# 10 bootstrap resamples, stratified like the initial split
data_boost <- bootstraps(data_train, strata = Type, times = 10)

# Elastic-net spec with penalty and mixture left to be tuned
tune_lasso <- logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

# 50-level regular grid over (penalty, mixture)
mi_grid <- grid_regular(penalty(),
                        mixture(),
                        levels = 50)

# Grid search over the bootstraps, in parallel
doParallel::registerDoParallel()
set.seed(1992)
lasso_grid <- tune_grid(
  wf1 %>% add_model(tune_lasso),
  resamples = data_boost,
  grid = mi_grid
)
doParallel::stopImplicitCluster()

# Metrics vs penalty (log x-axis: the penalty grid is log-spaced).
# FIX: the original fused stopImplicitCluster() with this pipeline.
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(x = penalty, y = mean, color = .metric)) +
  geom_line() +
  geom_smooth(se = FALSE) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10() +
  scale_color_manual(values = colores) +
  theme(legend.position = "none")

# Metrics vs mixture.
# FIX: dropped scale_x_log10() here — mixture lives on [0, 1] and the regular
# grid includes 0, which a log axis silently drops.
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(x = mixture, y = mean, color = .metric)) +
  geom_line() +
  geom_smooth(se = FALSE) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_color_manual(values = colores) +
  theme(legend.position = "none")

# vip provides variable-importance plots
library(vip)
# Final workflow with the best hyper-parameters.
# FIX(review): `mejor_tuning` was used but never defined in the source; select
# the best combination from the grid search (metric assumed to be roc_auc —
# confirm against the lost notebook chunk).
mejor_tuning <- select_best(lasso_grid, metric = "roc_auc")

modelo_final <- finalize_workflow(wf1 %>% add_model(tune_lasso),
                                  parameters = mejor_tuning)

# Final fit, kept as `ajuste_final` because the evaluation sections below
# predict with it (the source used that name without ever defining it).
ajuste_final <- modelo_final %>%
  fit(data_train)

# Variable importance of the finalized lasso
ajuste_final %>%
  pull_workflow_fit() %>%
  vi(lambda = mejor_tuning$penalty) %>%
  mutate(Importance = abs(Importance),
         Variable = fct_reorder(Variable, Importance)) %>%
  ggplot(aes(x = Importance, y = Variable, color = Sign, fill = Sign)) +
  geom_col(alpha = 0.7) +
  scale_color_manual(values = colores) +
  scale_fill_manual(values = colores) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(title = "Importancia de variables")
# == Workflow [trained] == (console output pasted into the script from here)
# Console output pasted into the script; commented out so the file parses:
# Preprocessor: Recipe
# Model: logistic_reg()
# -- Preprocessor ------------------------------------------------------------
# 6 Recipe Steps
# * step_knnimpute()
# * step_BoxCox()
# * step_other()
# * step_dummy()
# * step_normalize()
# * step_upsample()
# -- Model -------------------------------------------------------------------
# Call: glmnet::glmnet(x = as.matrix(x), y = y, family = "binomial", alpha = ~1)
# Df %Dev Lambda
# 1 0 0.00 0.066880
# 2 1 0.22 0.060940
# 3 1 0.40 0.055520
# 4 1 0.55 0.050590
# 5 1 0.68 0.046100
# 6 1 0.79 0.042000
# 7 1 0.87 0.038270
# 8 1 0.95 0.034870
# 9 6 1.07 0.031770
# 10 6 1.29 0.028950
# 11 7 1.47 0.026380
# 12 8 1.65 0.024030
# 13 8 1.81 0.021900
# 14 9 1.96 0.019950
# 15 10 2.11 0.018180
# 16 11 2.25 0.016570
# 17 13 2.39 0.015090
# 18 13 2.52 0.013750
# 19 13 2.64 0.012530
# 20 14 2.74 0.011420
# 21 14 2.84 0.010400
# 22 15 2.92 0.009480
# 23 15 3.00 0.008638
# 24 15 3.07 0.007870
# 25 15 3.12 0.007171
# 26 16 3.17 0.006534
# 27 16 3.22 0.005954
# 28 16 3.25 0.005425
# 29 16 3.29 0.004943
# 30 17 3.31 0.004504
# 31 17 3.34 0.004104
# 32 17 3.36 0.003739
# 33 17 3.38 0.003407
# 34 18 3.39 0.003104
# 35 18 3.41 0.002828
# 36 18 3.42 0.002577
# 37 18 3.43 0.002348
# 38 18 3.44 0.002140
# 39 18 3.45 0.001950
# 40 18 3.46 0.001776
# 41 18 3.46 0.001619
# 42 18 3.47 0.001475
# 43 18 3.47 0.001344
# 44 18 3.48 0.001224
# 45 18 3.48 0.001116
# 46 19 3.48 0.001016
# ...
# and 6 more lines.
# In-sample (train) predictions with the finalized model.
# FIX: the original fused several statement boundaries (head(...) with the next
# pipeline, filter(...) with the ROC pipeline, etc.); all un-fused below.
predichos_train <- ajuste_final$fit$fit %>%
  predict(new_data = juice(receta1_prep) %>% select(-Rating), type = "class") %>%
  bind_cols(juice(receta1_prep) %>% select(Rating)) %>%
  mutate_all(as.factor)
head(predichos_train)

# Train confusion matrix as a tile plot
predichos_train %>%
  conf_mat(Rating, .pred_class) %>%
  pluck(1) %>%
  as_tibble() %>%
  ggplot(aes(x = Prediction, y = Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8)

# Train accuracy
predichos_train %>%
  metrics(Rating, .pred_class) %>%
  select(-.estimator) %>%
  filter(.metric == "accuracy")

# Train ROC curve
ajuste_final$fit$fit %>%
  predict(new_data = juice(receta1_prep) %>% select(-Rating), type = "prob") %>%
  bind_cols(juice(receta1_prep) %>% select(Rating)) %>%
  roc_curve(Rating, .pred_0) %>%
  autoplot() +
  labs(title = "ROC - Train")

# Test-set predictions.
# FIX(review): `test_baked` was used but never defined in the source; bake the
# held-out split with the trained training recipe — confirm against the lost
# notebook chunk.
test_baked <- bake(receta1_prep, new_data = data_test)

predichos_test <- ajuste_final$fit$fit %>%
  predict(new_data = test_baked %>% select(-Rating), type = "class") %>%
  bind_cols(test_baked %>% select(Rating)) %>%
  mutate_all(as.factor)
head(predichos_test)

# Test confusion matrix
predichos_test %>%
  conf_mat(Rating, .pred_class) %>%
  pluck(1) %>%
  as_tibble() %>%
  ggplot(aes(x = Prediction, y = Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8)

# Test accuracy
predichos_test %>%
  metrics(Rating, .pred_class) %>%
  select(-.estimator) %>%
  filter(.metric == "accuracy")

# Test ROC curve
ajuste_final$fit$fit %>%
  predict(new_data = test_baked %>% select(-Rating), type = "prob") %>%
  bind_cols(test_baked %>% select(Rating)) %>%
  roc_curve(Rating, .pred_0) %>%
  autoplot() +
  labs(title = "ROC - Test")
# predicciones
# Final class predictions for the submission file
predichos_final1 <- ajuste_final$fit$fit %>%
  predict(new_data = data_sub, type = "class")

# Submission: replace the placeholder rating column with the predictions.
# FIX: right-assignment (`->`) replaced with conventional `<-`.
sub_02_glmnet <- sampleSub %>%
  select(-rating) %>%
  mutate(rating = predichos_final1$.pred_class)
head(sub_02_glmnet)